Import Word Documents


In [1]:
# Set up path to files to be used for testing

test_data_path = "/Users/wbrierley/Documents/Jupyter Notebooks/Document Parsing Data/"
test_filename = "Brierley Bill Full CV 20180812.docx"
docxfile = test_data_path + test_filename

Word Documents are Archives of xml files


In [2]:
import zipfile

In [11]:
mydocx = zipfile.ZipFile(docxfile)
mydocx.namelist()


Out[11]:
['[Content_Types].xml',
 '_rels/.rels',
 'word/_rels/document.xml.rels',
 'word/document.xml',
 'word/footnotes.xml',
 'word/endnotes.xml',
 'word/header1.xml',
 'word/footer1.xml',
 'word/theme/theme1.xml',
 'word/settings.xml',
 'word/_rels/settings.xml.rels',
 'word/webSettings.xml',
 'word/styles.xml',
 'word/numbering.xml',
 'docProps/core.xml',
 'word/fontTable.xml',
 'docProps/app.xml']

In [18]:
with mydocx.open('[Content_Types].xml') as component:
    print(component.read().getroot()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-18-11e1a26be2ff> in <module>
      1 with mydocx.open('[Content_Types].xml') as component:
----> 2     print(component.parse().getroot())

AttributeError: 'ZipExtFile' object has no attribute 'parse'

Processing the xml


In [ ]:
import xml.etree.ElementTree as ET

In [ ]:
def opendocx(file):
    '''Open a docx file, return a document XML tree'''
    mydoc = zipfile.ZipFile(file)
    xmlcontent = ET.parse(mydoc)
    #document = ET.fromstring(mydoc)
    return document

In [ ]:
doc_root = doc.getroot()

In [ ]: